In [2]:
%matplotlib notebook

import itertools
import logging
from functools import partial

import gensim
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pnd
# NOTE(review): star import pollutes the namespace; prefer importing the
# specific clustering estimators actually used so readers can trace names.
from sklearn.cluster import *
# NOTE(review): RandomizedPCA was deprecated in scikit-learn 0.18 and removed
# later; PCA(svd_solver="randomized") is the modern equivalent — confirm the
# installed scikit-learn version before upgrading.
from sklearn.decomposition import PCA, RandomizedPCA
from sklearn.manifold import TSNE

# NOTE(review): another star import from a project-local module; contents
# cannot be verified from this file.
from knub.thesis.util import *
matplotlib.style.use('ggplot')

In [10]:
pnd.set_option("display.max_colwidth", 100)

Topic Models → Topic Coherence, Concept Categorization

Evaluated using the Palmetto tool from the paper *Exploring the Space of Topic Coherence Measures*. The values still seem low compared to the example values reported in the paper.


In [4]:
# Topic-coherence (TC) and concept-categorization (CC) scores per topic model.
# CC_purity is recorded here for provenance but dropped from the analysis below.
_tc_records = [
    ("topic.full.alpha-1-100.256-400.model", 0.469500859375, 0.00617111859067, 0.6463414634146342),
    ("topic.16-400.model", 0.43805875, 0.00390183951094, 0.5975609756097561),
    ("topic.256-1000.model", 0.473455351563, 0.00635883046394, 0.5853658536585366),
    ("topic.64-400.model", 0.45327734375, 0.00385141007263, 0.6341463414634146),
    ("topic.256-400.model", 0.46836359375, 0.00599032492068, 0.5731707317073171),
    ("topic.full.fixed-vocabulary.alpha-1-100.256-400.model", 0.468437070312, 0.00562772603243, 0.5975609756097561),
    ("topic.full.256-400.model", 0.472498945313, 0.00624853749772, 0.5975609756097561),
    ("topic.256-600.model", 0.478640273437, 0.00685787139094, 0.5609756097560975),
]
df_tc_results = pnd.DataFrame(
    _tc_records,
    columns=["Topic model parameters", "TC_mean", "TC_var", "CC_purity"],
)
del df_tc_results["CC_purity"]

In [5]:
df_tc_results.sort_values(by="TC_mean", ascending=False)


Out[5]:
Topic model parameters TC_mean TC_var
7 topic.256-600.model 0.478640 0.006858
2 topic.256-1000.model 0.473455 0.006359
6 topic.full.256-400.model 0.472499 0.006249
0 topic.full.alpha-1-100.256-400.model 0.469501 0.006171
5 topic.full.fixed-vocabulary.alpha-1-100.256-40... 0.468437 0.005628
4 topic.256-400.model 0.468364 0.005990
3 topic.64-400.model 0.453277 0.003851
1 topic.16-400.model 0.438059 0.003902

In [7]:
df_tc_results.sort_values(by="TC_var", ascending=False)


Out[7]:
Topic model parameters TC_mean TC_var
7 topic.256-600.model 0.478640 0.006858
2 topic.256-1000.model 0.473455 0.006359
6 topic.full.256-400.model 0.472499 0.006249
0 topic.full.alpha-1-100.256-400.model 0.469501 0.006171
4 topic.256-400.model 0.468364 0.005990
5 topic.full.fixed-vocabulary.alpha-1-100.256-40... 0.468437 0.005628
1 topic.16-400.model 0.438059 0.003902
3 topic.64-400.model 0.453277 0.003851

In [12]:
# Second coherence run, precomputed and stored as a tab-separated file.
df_tc_results_2 = pnd.read_csv(
    "../models/topic_models_coherence_2.tsv", sep="\t", index_col=None
)
# Best mean topic coherence first.
df_tc_results_2.sort_values("TC_mean", ascending=False)


Out[12]:
model TC_mean TC_var
8 topic.256-400.first-2000.alpha-0-1.beta-0-1.model.ssv 0.495 0.095
5 topic.256-400.first-2000.alpha-0-01.beta-0-1.model.ssv 0.494 0.093
2 topic.256-400.first-2000.alpha-0-002.beta-0-1.model.ssv 0.478 0.084
7 topic.256-400.first-2000.alpha-0-1.beta-0-01.model.ssv 0.476 0.086
4 topic.256-400.first-2000.alpha-0-01.beta-0-01.model.ssv 0.475 0.083
6 topic.256-400.first-2000.alpha-0-1.beta-0-002.model.ssv 0.475 0.083
0 topic.256-400.first-2000.alpha-0-002.beta-0-002.model.ssv 0.470 0.079
1 topic.256-400.first-2000.alpha-0-002.beta-0-01.model.ssv 0.470 0.079
3 topic.256-400.first-2000.alpha-0-01.beta-0-002.model.ssv 0.469 0.079
9 embedding.model.skip-gram.ssv 0.466 0.123
10 embedding.model.cbow.ssv 0.433 0.067

Word Embeddings → Analogy Reasoning

Using manually set parameters

Using the question word data set (~19k questions) from Efficient Estimation of Word Representations in Vector Space (word2vec).


In [8]:
# Analogy-reasoning accuracy for each embedding model (higher is better).
_ar_records = [
    ("embedding.skip-gram.size-200.window-5.negative-5.model", 0.481221858371),
    ("embedding.cbow.size-200.window-5.model", 0.416547277937),
    ("embedding.google.size-300", 0.735878018829),
]
df_ar_results = pnd.DataFrame(
    _ar_records,
    columns=["Word Embeddings", "Analogy_Reasoning"],
)

# Best-scoring embedding first.
df_ar_results.sort_values("Analogy_Reasoning", ascending=False)


Out[8]:
Word Embeddings Analogy_Reasoning
2 embedding.google.size-300 0.735878
0 embedding.skip-gram.size-200.window-5.negative... 0.481222
1 embedding.cbow.size-200.window-5.model 0.416547

Using Spearmint

Testing only skip-gram architecture.


In [12]:
# Spearmint hyper-parameter search results for the skip-gram architecture.
df_ar_spearmint_results = pnd.read_csv(
    "../code/python/knub/thesis/spearmint_analogy_reasoning/results.csv",
    index_col="model",
)
# Best hyper-parameter combinations first.
df_ar_spearmint_results.sort_values("Analogy_Reasoning", ascending=False)


Out[12]:
sample window negative size Analogy_Reasoning
model
4 0.010000 8 18 476 0.713262
5 0.000004 6 20 600 0.712955
3 0.005000 5 12 325 0.707941
1 0.000000 3 5 50 0.384977